Using the ‘PWSonly’ sites (~770k snps) Running PCAngsd requires several steps
#first create ped/bed files with adding variant id
module load plink
plink --vcf /home/ktist/ph/data/new_vcf/MD7000/PWSonly_NS0.5_maf05.vcf.gz --set-missing-var-ids @:#[ph]\\$r,\\$a --make-bed --out /home/ktist/ph/data/new_vcf/MD7000/plinkfiles/PWSonly_NS0.5_maf05
plink --bfile /home/ktist/ph/data/new_vcf/MD7000/plinkfiles/PWSonly_NS0.5_maf05 --recode --tab --out /home/ktist/ph/data/new_vcf/MD7000/plinkfiles/PWSonly_NS0.5_maf05
#find highly correlated sites for pruning
plink --file /home/ktist/ph/data/new_vcf/MD7000/plinkfiles/PWSonly_NS0.5_maf05 --indep-pairwise 75'kb' 5 0.5 --out /home/ktist/ph/data/new_vcf/MD7000/plinkfiles/PWSonly_NS0.5_maf05_75_5_0.5
#50kb does not make much difference
plink --file /home/ktist/ph/data/new_vcf/MD7000/plinkfiles/PWSonly_NS0.5_maf05 --indep-pairwise 50'kb' 5 0.5 --out /home/ktist/ph/data/new_vcf/MD7000/plinkfiles/PWSonly_NS0.5_maf05_50_5_0.5
# See comparison results in Data/PCAngsd/pruning_parameter_difference.txt
#Reformat prun.in file with running the reformat_prunin.R
R
source("reformat_prunin.R")
reformat_prunin("ph/data/new_vcf/MD7000/plinkfiles/PWSonly_NS0.5_maf05_75_5_0.5.prune.in")
quit()#create vcf files with only prune.in sites
#index the vcf first
bcftools index home/ktist/ph/data/new_vcf/MD7000/PWSonly_NS0.5_maf05.vcf.gz
bcftools view -R /home/ktist/ph/data/new_vcf/MD7000/plinkfiles/PWSonly_NS0.5_maf05_75_5_0.5.prune.in.sites.txt /home/ktist/ph/data/new_vcf/MD7000/PWSonly_NS0.5_maf05.vcf.gz > /home/ktist/ph/data/new_vcf/MD7000/plinkfiles/PWSonly_maf05_pruned.vcf
bgzip /home/ktist/ph/data/new_vcf/MD7000/plinkfiles/PWSonly_maf05_pruned.vcf
#Create beagle files (create_beagle_PWS.sh)
#e.g.
vcftools --gzvcf /home/ktist/ph/data/new_vcf/MD7000/plinkfiles/PWSonly_maf05_pruned.vcf.gz --chr chr1 --out /home/ktist/ph/data/new_vcf/MD7000/beagle/PWSonly_pruned_c1 --BEAGLE-PL
gzip /home/ktist/ph/data/new_vcf/MD7000/beagle/PWSonly_pruned_c1.BEAGLE.PL
#Run PCAngsd (pcangsd_pws.sh)
#e.g.
python /home/jamcgirr/apps/pcangsd/pcangsd.py -beagle /home/ktist/ph/data/new_vcf/MD7000/beagle/PWSonly_pruned_c1.BEAGLE.PL.gz -o /home/ktist/ph/data/angsd/PCAngsd/PWSonly_maf05_chr1 -threads 16 pop_info<-read.csv("../Data/Sample_metadata_892pops.csv")
pop_info<-pop_info[,c("Sample","Population.Year","pop","Year.Collected")]
colnames(pop_info)[4]<-"year"
pops<-pop_info[pop_info$pop=="PWS",]
pcols<-c("#d7b5d8","#df65b0","#dd1c77","#980043")
Plots<-list()
chr<-paste0("chr", c(1:23,25:26) ) #exclude chr24
for (i in 1:length(chr)){
C <- as.matrix(read.table(paste0("../Data/PCAangsd/PWSonly_maf05_",chr[i],".cov")))
e <- eigen(C)
pca <-data.frame(Sample=pops$Sample,
pop=pops$pop,
year=pops$year,
PC1=e$vectors[,1],PC2=e$vectors[,2],
PC3=e$vectors[,3],PC4=e$vectors[,4],
PC5=e$vectors[,5],PC6=e$vectors[,6],
PC7=e$vectors[,7],PC8=e$vectors[,8],
stringsAsFactors=FALSE)
prop_explained <- c()
for (s in e$values[1:10]) {
#print(s / sum(e$values))
prop_explained <- c(prop_explained,round(((s / sum(e$values))*100),2))
}
#write.csv(pca, paste0("../Output/PCA/PWSonly_pca_",chr[i],".csv"), row.names = F)
Plots[[i]]<-ggplot()+
geom_point(data = pca, aes(x = PC1, y = PC2, fill = factor(year), color = factor(year), shape = factor(year)), size = 3)+
scale_shape_manual(values=c(23,25,3,21), guide="none")+
scale_fill_manual(values=paste0(pcols,"99"), guide="none")+
scale_color_manual(values=pcols[1:4])+
xlab(paste("PC 1: ", prop_explained[1],"%\n",sep = ""))+
ylab(paste("PC 2: ", prop_explained[2],"%\n",sep = ""))+
theme_bw()+
ggtitle(paste0(chr[i]))+
guides(shape = guide_legend(override.aes =c(23,25,3,21)), fill=guide_legend(override.aes =paste0(pcols,"99")))+theme(legend.title = element_blank())
}
{pdf("../Output/PCA/PWSonly_PCA_byChromosome.pdf",width = 35, height = 30)
do.call(grid.arrange, c(Plots, ncol=5))
dev.off()
}
g <- arrangeGrob(do.call(grid.arrange, c(Plots, ncol=5)))
ggsave(g, file="../Output/PCA/PWSonly_PCA_byChromosome.png",width = 35, height = 30)